0.1 BE note:

Use rmarkdown::render("MSc_AS-SMM634-Group4-Project.rmd", output_dir = "docs") to render…

1 Initial processing

## dependencies / external librarys
library(dplyr)
library(ggplot2)
library(car)
library(MASS)

1.1 Read Data

rm(list = ls())
# CarDataRead.R

# source(file.path(".", "CarDataRead.R"))
df <- read.csv("car_price.csv") |>
  mutate(fsymboling = as.factor(symboling)) |>
  mutate(safety = case_match(
    symboling,
    c(-2, -1) ~ "<-1",
    0 ~ "0",
    1 ~ "1",
    2 ~ "2",
    3 ~ "3"
  )) |>
  mutate(safetyIncr = case_match(
    symboling,
    c(-2, -1) ~ "4",
    0 ~ "3",
    1 ~ "2",
    2 ~ "1",
    3 ~ "0"
  )) |>
  mutate(safetyIncr2 = case_match(
    symboling,
    0 ~ "Base",
    c(-2, -1) ~ "Safer",
    c(1, 2, 3) ~ "Riskier"
  )) |>
  mutate(cylinderNum = case_match(cylindernumber,
    c("two", "three") ~ "leq_three",
    c("eight", "twelve") ~ "geq_eight",
    .default = cylindernumber
  ))

table(df$cylinderNum)
## 
##      five      four geq_eight leq_three       six 
##        11       159         6         5        24

1.1.1 Clean data

Want to check for any typos, missing data, or NaN values.

1.1.2 Car manufacturer

Extracting car manufacturer from first part of CarName and correcting spelling errors.

# extract car manufacturer from first part of CarName
df$carManufacturer <- sapply(strsplit(df$CarName, " +"), `[`, 1)
# print
table(df$carManufacturer)
## 
## alfa-romero        audi         bmw       buick   chevrolet       dodge       honda 
##           3           7           8           8           3           9          13 
##       isuzu      jaguar       maxda       mazda     mercury  mitsubishi      nissan 
##           4           3           2          15           1          13          17 
##      Nissan     peugeot    plymouth    porcshce     porsche     renault        saab 
##           1          11           7           1           4           2           6 
##      subaru      toyota     toyouta   vokswagen  volkswagen       volvo          vw 
##          12          31           1           1           9          11           2

Our assumptions are as follows:

Typo / Mis-spelling Correct Spelling
maxda mazda
Nissan nissan
porcshce porsche
toyouta toyota
vokswagen volkswagen
vw volkswagen
df <- df %>% mutate(carManufacturer = case_match(carManufacturer,
  "maxda" ~ "mazda",
  "Nissan" ~ "nissan",
  "porcshce" ~ "porsche",
  "toyouta" ~ "toyota",
  c("vokswagen", "vw") ~ "volkswagen",
  .default = carManufacturer
))
table(df$carManufacturer)
## 
## alfa-romero        audi         bmw       buick   chevrolet       dodge       honda 
##           3           7           8           8           3           9          13 
##       isuzu      jaguar       mazda     mercury  mitsubishi      nissan     peugeot 
##           4           3          17           1          13          18          11 
##    plymouth     porsche     renault        saab      subaru      toyota  volkswagen 
##           7           5           2           6          12          32          12 
##       volvo 
##          11

1.1.3 NA value check

# check individual columns for NaN values
colSums(is.na(df))
##           car_ID        symboling          CarName         fueltype       aspiration 
##                0                0                0                0                0 
##       doornumber          carbody       drivewheel   enginelocation        wheelbase 
##                0                0                0                0                0 
##        carlength         carwidth        carheight       curbweight       enginetype 
##                0                0                0                0                0 
##   cylindernumber       enginesize       fuelsystem        boreratio           stroke 
##                0                0                0                0                0 
## compressionratio       horsepower          peakrpm          citympg       highwaympg 
##                0                0                0                0                0 
##            price       fsymboling           safety       safetyIncr      safetyIncr2 
##                0                0                0                0                0 
##      cylinderNum  carManufacturer 
##                0                0
# check total number of NaN values
cat(c("Total number of NaN values:", sum(colSums(is.na(df)))))
## Total number of NaN values: 0

Looks good - no NaN values.

1.2 Data examination & visualisation

1.2.1 Overview

summary(df)
##      car_ID      symboling         CarName            fueltype        
##  Min.   :  1   Min.   :-2.0000   Length:205         Length:205        
##  1st Qu.: 52   1st Qu.: 0.0000   Class :character   Class :character  
##  Median :103   Median : 1.0000   Mode  :character   Mode  :character  
##  Mean   :103   Mean   : 0.8341                                        
##  3rd Qu.:154   3rd Qu.: 2.0000                                        
##  Max.   :205   Max.   : 3.0000                                        
##   aspiration         doornumber          carbody           drivewheel       
##  Length:205         Length:205         Length:205         Length:205        
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##  enginelocation       wheelbase        carlength        carwidth       carheight    
##  Length:205         Min.   : 86.60   Min.   :141.1   Min.   :60.30   Min.   :47.80  
##  Class :character   1st Qu.: 94.50   1st Qu.:166.3   1st Qu.:64.10   1st Qu.:52.00  
##  Mode  :character   Median : 97.00   Median :173.2   Median :65.50   Median :54.10  
##                     Mean   : 98.76   Mean   :174.0   Mean   :65.91   Mean   :53.72  
##                     3rd Qu.:102.40   3rd Qu.:183.1   3rd Qu.:66.90   3rd Qu.:55.50  
##                     Max.   :120.90   Max.   :208.1   Max.   :72.30   Max.   :59.80  
##    curbweight    enginetype        cylindernumber       enginesize   
##  Min.   :1488   Length:205         Length:205         Min.   : 61.0  
##  1st Qu.:2145   Class :character   Class :character   1st Qu.: 97.0  
##  Median :2414   Mode  :character   Mode  :character   Median :120.0  
##  Mean   :2556                                         Mean   :126.9  
##  3rd Qu.:2935                                         3rd Qu.:141.0  
##  Max.   :4066                                         Max.   :326.0  
##   fuelsystem          boreratio        stroke      compressionratio   horsepower   
##  Length:205         Min.   :2.54   Min.   :2.070   Min.   : 7.00    Min.   : 48.0  
##  Class :character   1st Qu.:3.15   1st Qu.:3.110   1st Qu.: 8.60    1st Qu.: 70.0  
##  Mode  :character   Median :3.31   Median :3.290   Median : 9.00    Median : 95.0  
##                     Mean   :3.33   Mean   :3.255   Mean   :10.14    Mean   :104.1  
##                     3rd Qu.:3.58   3rd Qu.:3.410   3rd Qu.: 9.40    3rd Qu.:116.0  
##                     Max.   :3.94   Max.   :4.170   Max.   :23.00    Max.   :288.0  
##     peakrpm        citympg        highwaympg        price       fsymboling
##  Min.   :4150   Min.   :13.00   Min.   :16.00   Min.   : 5118   -2: 3     
##  1st Qu.:4800   1st Qu.:19.00   1st Qu.:25.00   1st Qu.: 7788   -1:22     
##  Median :5200   Median :24.00   Median :30.00   Median :10295   0 :67     
##  Mean   :5125   Mean   :25.22   Mean   :30.75   Mean   :13277   1 :54     
##  3rd Qu.:5500   3rd Qu.:30.00   3rd Qu.:34.00   3rd Qu.:16503   2 :32     
##  Max.   :6600   Max.   :49.00   Max.   :54.00   Max.   :45400   3 :27     
##     safety           safetyIncr        safetyIncr2        cylinderNum       
##  Length:205         Length:205         Length:205         Length:205        
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##  carManufacturer   
##  Length:205        
##  Class :character  
##  Mode  :character  
##                    
##                    
## 
str(df)
## 'data.frame':    205 obs. of  32 variables:
##  $ car_ID          : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ symboling       : int  3 3 1 2 2 2 1 1 1 0 ...
##  $ CarName         : chr  "alfa-romero giulia" "alfa-romero stelvio" "alfa-romero Quadrifoglio" "audi 100 ls" ...
##  $ fueltype        : chr  "gas" "gas" "gas" "gas" ...
##  $ aspiration      : chr  "std" "std" "std" "std" ...
##  $ doornumber      : chr  "two" "two" "two" "four" ...
##  $ carbody         : chr  "convertible" "convertible" "hatchback" "sedan" ...
##  $ drivewheel      : chr  "rwd" "rwd" "rwd" "fwd" ...
##  $ enginelocation  : chr  "front" "front" "front" "front" ...
##  $ wheelbase       : num  88.6 88.6 94.5 99.8 99.4 ...
##  $ carlength       : num  169 169 171 177 177 ...
##  $ carwidth        : num  64.1 64.1 65.5 66.2 66.4 66.3 71.4 71.4 71.4 67.9 ...
##  $ carheight       : num  48.8 48.8 52.4 54.3 54.3 53.1 55.7 55.7 55.9 52 ...
##  $ curbweight      : int  2548 2548 2823 2337 2824 2507 2844 2954 3086 3053 ...
##  $ enginetype      : chr  "dohc" "dohc" "ohcv" "ohc" ...
##  $ cylindernumber  : chr  "four" "four" "six" "four" ...
##  $ enginesize      : int  130 130 152 109 136 136 136 136 131 131 ...
##  $ fuelsystem      : chr  "mpfi" "mpfi" "mpfi" "mpfi" ...
##  $ boreratio       : num  3.47 3.47 2.68 3.19 3.19 3.19 3.19 3.19 3.13 3.13 ...
##  $ stroke          : num  2.68 2.68 3.47 3.4 3.4 3.4 3.4 3.4 3.4 3.4 ...
##  $ compressionratio: num  9 9 9 10 8 8.5 8.5 8.5 8.3 7 ...
##  $ horsepower      : int  111 111 154 102 115 110 110 110 140 160 ...
##  $ peakrpm         : int  5000 5000 5000 5500 5500 5500 5500 5500 5500 5500 ...
##  $ citympg         : int  21 21 19 24 18 19 19 19 17 16 ...
##  $ highwaympg      : int  27 27 26 30 22 25 25 25 20 22 ...
##  $ price           : num  13495 16500 16500 13950 17450 ...
##  $ fsymboling      : Factor w/ 6 levels "-2","-1","0",..: 6 6 4 5 5 5 4 4 4 3 ...
##  $ safety          : chr  "3" "3" "1" "2" ...
##  $ safetyIncr      : chr  "0" "0" "2" "1" ...
##  $ safetyIncr2     : chr  "Riskier" "Riskier" "Riskier" "Riskier" ...
##  $ cylinderNum     : chr  "four" "four" "six" "four" ...
##  $ carManufacturer : chr  "alfa-romero" "alfa-romero" "alfa-romero" "audi" ...

1.2.2 Factor variables: General

table(df$symboling)
## 
## -2 -1  0  1  2  3 
##  3 22 67 54 32 27
table(df$safety)
## 
## <-1   0   1   2   3 
##  25  67  54  32  27
table(df$safetyIncr)
## 
##  0  1  2  3  4 
## 27 32 54 67 25
table(df$cylindernumber)
## 
##  eight   five   four    six  three twelve    two 
##      5     11    159     24      1      1      4
table(df$cylinderNum)
## 
##      five      four geq_eight leq_three       six 
##        11       159         6         5        24
# factor_cols <- c("symboling", "safety", "safetyIncr", "cylindernumber", "cylinderNum","carbody")
# examine categorical fields
factor_cols<- c(
  "symboling",
  "safety",
  "safetyIncr",
  "safetyIncr2",
  "cylindernumber",
  "cylinderNum",
  "carbody",
  "fueltype",
  "aspiration",
  "doornumber",
  "carbody",
  "drivewheel",
  "enginelocation", # BE note: 202:3 split so don't include in model
  "enginetype",
  "cylindernumber",
  "fuelsystem",
  "carManufacturer"
)

cols_use <- unique(intersect(factor_cols, names(df)))
fc_missing <- setdiff(unique(factor_cols), names(df))
if (length(fc_missing)) {
  message(
    "Missing in df (skipped): ",
    paste(fc_missing, collapse = ", ")
  )
}
tab_list <- setNames(
  lapply(cols_use, function(v) table(df[[v]], useNA = "ifany")),
  # lapply(cols_use, function(v) prop.table(table(df[[v]], useNA = "ifany"))),
  cols_use
)
for (nm in names(tab_list)) {
  cat("\n==== ", nm, " ====\n", sep = "")
  print(tab_list[[nm]])
}
## 
## ==== symboling ====
## 
## -2 -1  0  1  2  3 
##  3 22 67 54 32 27 
## 
## ==== safety ====
## 
## <-1   0   1   2   3 
##  25  67  54  32  27 
## 
## ==== safetyIncr ====
## 
##  0  1  2  3  4 
## 27 32 54 67 25 
## 
## ==== safetyIncr2 ====
## 
##    Base Riskier   Safer 
##      67     113      25 
## 
## ==== cylindernumber ====
## 
##  eight   five   four    six  three twelve    two 
##      5     11    159     24      1      1      4 
## 
## ==== cylinderNum ====
## 
##      five      four geq_eight leq_three       six 
##        11       159         6         5        24 
## 
## ==== carbody ====
## 
## convertible     hardtop   hatchback       sedan       wagon 
##           6           8          70          96          25 
## 
## ==== fueltype ====
## 
## diesel    gas 
##     20    185 
## 
## ==== aspiration ====
## 
##   std turbo 
##   168    37 
## 
## ==== doornumber ====
## 
## four  two 
##  115   90 
## 
## ==== drivewheel ====
## 
## 4wd fwd rwd 
##   9 120  76 
## 
## ==== enginelocation ====
## 
## front  rear 
##   202     3 
## 
## ==== enginetype ====
## 
##  dohc dohcv     l   ohc  ohcf  ohcv rotor 
##    12     1    12   148    15    13     4 
## 
## ==== fuelsystem ====
## 
## 1bbl 2bbl 4bbl  idi  mfi mpfi spdi spfi 
##   11   66    3   20    1   94    9    1 
## 
## ==== carManufacturer ====
## 
## alfa-romero        audi         bmw       buick   chevrolet       dodge       honda 
##           3           7           8           8           3           9          13 
##       isuzu      jaguar       mazda     mercury  mitsubishi      nissan     peugeot 
##           4           3          17           1          13          18          11 
##    plymouth     porsche     renault        saab      subaru      toyota  volkswagen 
##           7           5           2           6          12          32          12 
##       volvo 
##          11
df <- df %>%
  mutate(across(all_of(factor_cols), as.factor))

1.2.3 Factor variables: check stored as factors in R

str(df)
## 'data.frame':    205 obs. of  32 variables:
##  $ car_ID          : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ symboling       : Factor w/ 6 levels "-2","-1","0",..: 6 6 4 5 5 5 4 4 4 3 ...
##  $ CarName         : chr  "alfa-romero giulia" "alfa-romero stelvio" "alfa-romero Quadrifoglio" "audi 100 ls" ...
##  $ fueltype        : Factor w/ 2 levels "diesel","gas": 2 2 2 2 2 2 2 2 2 2 ...
##  $ aspiration      : Factor w/ 2 levels "std","turbo": 1 1 1 1 1 1 1 1 2 2 ...
##  $ doornumber      : Factor w/ 2 levels "four","two": 2 2 2 1 1 2 1 1 1 2 ...
##  $ carbody         : Factor w/ 5 levels "convertible",..: 1 1 3 4 4 4 4 5 4 3 ...
##  $ drivewheel      : Factor w/ 3 levels "4wd","fwd","rwd": 3 3 3 2 1 2 2 2 2 1 ...
##  $ enginelocation  : Factor w/ 2 levels "front","rear": 1 1 1 1 1 1 1 1 1 1 ...
##  $ wheelbase       : num  88.6 88.6 94.5 99.8 99.4 ...
##  $ carlength       : num  169 169 171 177 177 ...
##  $ carwidth        : num  64.1 64.1 65.5 66.2 66.4 66.3 71.4 71.4 71.4 67.9 ...
##  $ carheight       : num  48.8 48.8 52.4 54.3 54.3 53.1 55.7 55.7 55.9 52 ...
##  $ curbweight      : int  2548 2548 2823 2337 2824 2507 2844 2954 3086 3053 ...
##  $ enginetype      : Factor w/ 7 levels "dohc","dohcv",..: 1 1 6 4 4 4 4 4 4 4 ...
##  $ cylindernumber  : Factor w/ 7 levels "eight","five",..: 3 3 4 3 2 2 2 2 2 2 ...
##  $ enginesize      : int  130 130 152 109 136 136 136 136 131 131 ...
##  $ fuelsystem      : Factor w/ 8 levels "1bbl","2bbl",..: 6 6 6 6 6 6 6 6 6 6 ...
##  $ boreratio       : num  3.47 3.47 2.68 3.19 3.19 3.19 3.19 3.19 3.13 3.13 ...
##  $ stroke          : num  2.68 2.68 3.47 3.4 3.4 3.4 3.4 3.4 3.4 3.4 ...
##  $ compressionratio: num  9 9 9 10 8 8.5 8.5 8.5 8.3 7 ...
##  $ horsepower      : int  111 111 154 102 115 110 110 110 140 160 ...
##  $ peakrpm         : int  5000 5000 5000 5500 5500 5500 5500 5500 5500 5500 ...
##  $ citympg         : int  21 21 19 24 18 19 19 19 17 16 ...
##  $ highwaympg      : int  27 27 26 30 22 25 25 25 20 22 ...
##  $ price           : num  13495 16500 16500 13950 17450 ...
##  $ fsymboling      : Factor w/ 6 levels "-2","-1","0",..: 6 6 4 5 5 5 4 4 4 3 ...
##  $ safety          : Factor w/ 5 levels "<-1","0","1",..: 5 5 3 4 4 4 3 3 3 2 ...
##  $ safetyIncr      : Factor w/ 5 levels "0","1","2","3",..: 1 1 3 2 2 2 3 3 3 4 ...
##  $ safetyIncr2     : Factor w/ 3 levels "Base","Riskier",..: 2 2 2 2 2 2 2 2 2 1 ...
##  $ cylinderNum     : Factor w/ 5 levels "five","four",..: 2 2 5 2 1 1 1 1 1 1 ...
##  $ carManufacturer : Factor w/ 22 levels "alfa-romero",..: 1 1 1 2 2 2 2 2 2 2 ...

1.2.4 Visualisation

Have chosen price as response variable. Want to see how it’s distributed… (might want to use log(price) as used in Intro to Python).

ggplot(df, aes(x = price)) +
  geom_histogram(bins = 30, fill = "blue", alpha = 0.7) +
  ggtitle("Distribution of Car Prices")

Look at numerical variables vs price

# price vs horsepower
ggplot(df, aes(x = horsepower, y = price)) +
  geom_point(alpha = 0.6) +
  geom_smooth(method = "lm", color = "red", se = FALSE) +
  ggtitle("Price vs Horsepower") +
  theme_minimal()

# Price vs engine size
ggplot(df, aes(x = enginesize, y = price)) +
  geom_point(alpha = 0.6) +
  geom_smooth(method = "lm", color = "red", se = FALSE) +
  ggtitle("Price vs Engine Size") +
  theme_minimal()

# Price vs highwaympg (could also use citympg)
ggplot(df, aes(x = highwaympg, y = price)) +
  geom_point(alpha = 0.6) +
  geom_smooth(method = "lm", color = "red", se = FALSE) +
  ggtitle("Price vs MPG (Highway)") +
  theme_minimal()

# Price vs citympg (note: city vs highwaympg are likely to be correlated)
ggplot(df, aes(x = citympg, y = price)) +
  geom_point(alpha = 0.6) +
  geom_smooth(method = "lm", color = "red", se = FALSE) +
  ggtitle("Price vs MPG (City)") +
  theme_minimal()

This is only a quick look, but clear issues with negative price assuming simple linear regression > ~40 mpg for both city and highway mpg

# Price vs fuel type

ggplot(df, aes(x = fueltype, y = price)) +
  geom_boxplot() +
  ggtitle("Price
Distribution by Fuel Type") +
  theme_minimal()

# Price vs car body

ggplot(df, aes(x = carbody, y = price)) +
  geom_boxplot() +
  ggtitle("Price
Distribution by Car Body") +
  theme_minimal()

# Price vs (new) manufacturer

ggplot(df, aes(x = carManufacturer, y = price)) +
  geom_boxplot() +
  ggtitle("Price Distribution by Manufacturer") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))

This is not unexpected, but is quite interesting to see. Some manufacturers are more expensive (& the range of models produced is also variable).

Based upon our personal investigation & search we think the data we have been given here are a subset of the data from 1985 Ward’s Automotive Yearbook (with a more comprehensive data set being available from https://archive.ics.uci.edu/dataset/10/automobile).

Note, this doesn’t consider companies that own other companies (i.e., Toyota currently owns Lexus, but both could be considered to be toyota despite both having different price points. This consideration is something we decided was beyond the scope of this analysis).

2 Pricing Assistant Model

Our main goal is to get an accurate predicted price \[\hat{y}\] for a new car.

BE note: A good pricing assistant chimes with goals of assessment and the lecture content.

2.1 Attempt 01

Excluding predictors that are highly correlated (parameter choice)

# Y = price
# X = horsepower, enginesize, curbweight, highwaympg,
#     fueltype, aspiration, carbody, drivewheel, cylinderNum

mdl.pa1 <- lm(
  price ~ horsepower + enginesize + curbweight + highwaympg +
    fueltype + aspiration + carbody + drivewheel + cylinderNum,
  data = df
)

print(summary(mdl.pa1))
## 
## Call:
## lm(formula = price ~ horsepower + enginesize + curbweight + highwaympg + 
##     fueltype + aspiration + carbody + drivewheel + cylinderNum, 
##     data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -8084.9 -1197.2   -81.7  1316.3 14226.0 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           -765.673   5565.870  -0.138  0.89073    
## horsepower              44.333     13.657   3.246  0.00138 ** 
## enginesize              56.908     18.424   3.089  0.00231 ** 
## curbweight               3.238      1.518   2.133  0.03420 *  
## highwaympg              46.589     74.310   0.627  0.53145    
## fueltypegas           -629.559   1227.468  -0.513  0.60863    
## aspirationturbo        400.001    825.850   0.484  0.62870    
## carbodyhardtop       -2067.166   1699.423  -1.216  0.22536    
## carbodyhatchback     -4742.402   1360.316  -3.486  0.00061 ***
## carbodysedan         -3091.102   1336.340  -2.313  0.02180 *  
## carbodywagon         -4489.628   1498.218  -2.997  0.00310 ** 
## drivewheelfwd           26.116   1141.418   0.023  0.98177    
## drivewheelrwd         1711.372   1229.935   1.391  0.16574    
## cylinderNumfour      -4860.555   1113.372  -4.366 2.09e-05 ***
## cylinderNumgeq_eight  2254.206   2250.742   1.002  0.31785    
## cylinderNumleq_three  -655.617   2039.925  -0.321  0.74827    
## cylinderNumsix       -2068.122   1369.065  -1.511  0.13257    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3053 on 188 degrees of freedom
## Multiple R-squared:  0.8654, Adjusted R-squared:  0.8539 
## F-statistic: 75.54 on 16 and 188 DF,  p-value: < 2.2e-16

F-statistic: 75.54 on 16 and 188 DF, p-value: < 2.2e-16

  • Test of null hypothesis that \(H_0: \beta_1 = \beta_2 = \dots = 0\)
  • Meaning all predictors are useless
  • F-value is large and p-value small so we can reject this null hypothesis

Adjusted R-squared: 0.8539

BE comments: Model accounts for ~85% of variability in the variables (Adj. R-squared = 85.4%, F-statistic p-value < 2.2e-16).

Quite a few predictors have high p-values (not statistically significant)

Next step to use Backward Selection process to run algorithm removing the variable with the largest p-value and re-fitting the model…

mdl.pa2 <- stepAIC(mdl.pa1, direction = "backward", trace = TRUE)
## Start:  AIC=3306.08
## price ~ horsepower + enginesize + curbweight + highwaympg + fueltype + 
##     aspiration + carbody + drivewheel + cylinderNum
## 
##               Df Sum of Sq        RSS    AIC
## - aspiration   1   2186978 1754787582 3304.3
## - fueltype     1   2452323 1755052928 3304.4
## - highwaympg   1   3664316 1756264920 3304.5
## <none>                     1752600604 3306.1
## - drivewheel   2  53740715 1806341319 3308.3
## - curbweight   1  42424115 1795024719 3309.0
## - enginesize   1  88938316 1841538921 3314.2
## - horsepower   1  98234746 1850835351 3315.3
## - carbody      4 193349724 1945950328 3319.5
## - cylinderNum  4 327945047 2080545651 3333.2
## 
## Step:  AIC=3304.33
## price ~ horsepower + enginesize + curbweight + highwaympg + fueltype + 
##     carbody + drivewheel + cylinderNum
## 
##               Df Sum of Sq        RSS    AIC
## - highwaympg   1   2819841 1757607423 3302.7
## - fueltype     1   7240661 1762028243 3303.2
## <none>                     1754787582 3304.3
## - drivewheel   2  51554337 1806341919 3306.3
## - curbweight   1  44899211 1799686793 3307.5
## - enginesize   1  87087320 1841874902 3312.3
## - carbody      4 191244297 1946031880 3317.5
## - horsepower   1 141840989 1896628571 3318.3
## - cylinderNum  4 340069822 2094857404 3332.6
## 
## Step:  AIC=3302.66
## price ~ horsepower + enginesize + curbweight + fueltype + carbody + 
##     drivewheel + cylinderNum
## 
##               Df Sum of Sq        RSS    AIC
## <none>                     1757607423 3302.7
## - fueltype     1  18328613 1775936037 3302.8
## - drivewheel   2  51993718 1809601142 3304.6
## - curbweight   1  47386509 1804993933 3306.1
## - enginesize   1  90937348 1848544771 3311.0
## - carbody      4 188713756 1946321179 3315.6
## - horsepower   1 140987873 1898595296 3316.5
## - cylinderNum  4 337658895 2095266319 3330.7

2.1.1 Attempt 01: Model comparison

See if variables removed as a group were significant.

Hypothesis:

  • \(H_0\)​: The simpler model (mdl.pa2) is sufficient
  • \(H_1\)​: The first (full) model (mdl.pa1) is significantly better.
print(anova(mdl.pa2, mdl.pa1))
## Analysis of Variance Table
## 
## Model 1: price ~ horsepower + enginesize + curbweight + fueltype + carbody + 
##     drivewheel + cylinderNum
## Model 2: price ~ horsepower + enginesize + curbweight + highwaympg + fueltype + 
##     aspiration + carbody + drivewheel + cylinderNum
##   Res.Df        RSS Df Sum of Sq      F Pr(>F)
## 1    190 1757607423                           
## 2    188 1752600604  2   5006819 0.2685 0.7648
par(bg = "white")
par(mfrow = c(2, 2))
plot(mdl.pa2)

par(mfrow = c(1, 1))
vif(mdl.pa2)
##                  GVIF Df GVIF^(1/(2*Df))
## horsepower   4.717089  1        2.171886
## enginesize  12.163868  1        3.487674
## curbweight   9.581354  1        3.095376
## fueltype     1.529787  1        1.236846
## carbody      1.980039  4        1.089141
## drivewheel   2.796677  2        1.293185
## cylinderNum  7.050859  4        1.276528

Brief BE summary:

  • VIF output is good - all values are below the 5 threshold, so collinearity is not a major issue in this model

Plots show two issues

  • Heteroscedasticity in residuals vs fitted (have funnel shape as the residuals get larger as the predicted price increases)
  • Some leverage points / large outliers

2.2 Attempt 02

To fix heteroscedasticity let’s use log(price) & look at what happens if we include carManufacturer?

mdl.pa3 <- lm(
  log(price) ~ horsepower + enginesize + curbweight + highwaympg +
    fueltype + aspiration + carbody + drivewheel + cylinderNum +
    carManufacturer,
  data = df
)
summary(mdl.pa3)
## 
## Call:
## lm(formula = log(price) ~ horsepower + enginesize + curbweight + 
##     highwaympg + fueltype + aspiration + carbody + drivewheel + 
##     cylinderNum + carManufacturer, data = df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.38320 -0.06125  0.00149  0.07107  0.39165 
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                7.921e+00  2.881e-01  27.498  < 2e-16 ***
## horsepower                 1.845e-03  9.248e-04   1.995 0.047619 *  
## enginesize                 1.392e-03  9.460e-04   1.471 0.143096    
## curbweight                 4.859e-04  7.902e-05   6.149 5.56e-09 ***
## highwaympg                -2.788e-03  3.509e-03  -0.795 0.427919    
## fueltypegas               -1.764e-02  6.014e-02  -0.293 0.769692    
## aspirationturbo            7.776e-02  4.274e-02   1.819 0.070675 .  
## carbodyhardtop            -1.651e-01  7.622e-02  -2.166 0.031741 *  
## carbodyhatchback          -2.283e-01  6.453e-02  -3.539 0.000521 ***
## carbodysedan              -1.645e-01  6.479e-02  -2.538 0.012051 *  
## carbodywagon              -2.453e-01  7.022e-02  -3.493 0.000611 ***
## drivewheelfwd              4.103e-02  5.471e-02   0.750 0.454396    
## drivewheelrwd              7.213e-02  6.621e-02   1.089 0.277540    
## cylinderNumfour            1.106e-01  8.426e-02   1.313 0.191139    
## cylinderNumgeq_eight      -3.306e-02  1.046e-01  -0.316 0.752281    
## cylinderNumleq_three       3.267e-01  1.145e-01   2.854 0.004861 ** 
## cylinderNumsix             1.219e-01  8.638e-02   1.411 0.160179    
## carManufactureraudi        2.936e-01  1.206e-01   2.435 0.015924 *  
## carManufacturerbmw         3.465e-01  9.778e-02   3.543 0.000513 ***
## carManufacturerbuick       2.318e-01  1.365e-01   1.697 0.091480 .  
## carManufacturerchevrolet  -1.765e-01  1.241e-01  -1.422 0.156774    
## carManufacturerdodge      -1.828e-01  9.899e-02  -1.847 0.066510 .  
## carManufacturerhonda      -7.288e-02  9.518e-02  -0.766 0.444941    
## carManufacturerisuzu      -6.875e-02  1.073e-01  -0.641 0.522474    
## carManufacturerjaguar     -1.046e-01  1.345e-01  -0.778 0.437904    
## carManufacturermazda      -3.609e-02  9.384e-02  -0.385 0.701016    
## carManufacturermercury    -8.873e-02  1.551e-01  -0.572 0.568014    
## carManufacturermitsubishi -2.249e-01  9.760e-02  -2.304 0.022457 *  
## carManufacturernissan     -1.415e-01  9.224e-02  -1.534 0.126826    
## carManufacturerpeugeot    -1.686e-01  1.065e-01  -1.583 0.115363    
## carManufacturerplymouth   -2.120e-01  1.004e-01  -2.112 0.036137 *  
## carManufacturerporsche     4.521e-01  1.084e-01   4.170 4.88e-05 ***
## carManufacturerrenault    -1.369e-01  1.282e-01  -1.068 0.286926    
## carManufacturersaab        7.133e-02  1.082e-01   0.659 0.510726    
## carManufacturersubaru     -1.630e-01  9.795e-02  -1.664 0.098036 .  
## carManufacturertoyota     -1.511e-01  8.732e-02  -1.730 0.085431 .  
## carManufacturervolkswagen -4.404e-02  9.448e-02  -0.466 0.641742    
## carManufacturervolvo       9.085e-03  9.783e-02   0.093 0.926122    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1248 on 167 degrees of freedom
## Multiple R-squared:  0.9498, Adjusted R-squared:  0.9386 
## F-statistic: 85.34 on 37 and 167 DF,  p-value: < 2.2e-16
par(bg = "white")
par(mfrow = c(2, 2))
plot(mdl.pa3)

par(mfrow = c(1, 1))
mdl.pa4 <- stepAIC(mdl.pa3, direction = "backward", trace = TRUE)
## Start:  AIC=-819.25
## log(price) ~ horsepower + enginesize + curbweight + highwaympg + 
##     fueltype + aspiration + carbody + drivewheel + cylinderNum + 
##     carManufacturer
## 
##                   Df Sum of Sq    RSS     AIC
## - drivewheel       2   0.01873 2.6198 -821.78
## - fueltype         1   0.00134 2.6024 -821.15
## - highwaympg       1   0.00984 2.6109 -820.48
## <none>                         2.6011 -819.25
## - enginesize       1   0.03372 2.6348 -818.61
## - aspiration       1   0.05155 2.6526 -817.23
## - horsepower       1   0.06202 2.6631 -816.42
## - cylinderNum      4   0.15826 2.7593 -815.14
## - carbody          4   0.33738 2.9385 -802.25
## - curbweight       1   0.58891 3.1900 -779.41
## - carManufacturer 21   2.77342 5.3745 -712.48
## 
## Step:  AIC=-821.78
## log(price) ~ horsepower + enginesize + curbweight + highwaympg + 
##     fueltype + aspiration + carbody + cylinderNum + carManufacturer
## 
##                   Df Sum of Sq    RSS     AIC
## - fueltype         1   0.00495 2.6248 -823.39
## - highwaympg       1   0.01111 2.6309 -822.91
## <none>                         2.6198 -821.78
## - enginesize       1   0.03664 2.6565 -820.93
## - aspiration       1   0.03803 2.6578 -820.83
## - horsepower       1   0.10127 2.7211 -816.01
## - cylinderNum      4   0.19570 2.8155 -815.01
## - carbody          4   0.34719 2.9670 -804.27
## - curbweight       1   0.61217 3.2320 -780.73
## - carManufacturer 21   2.92664 5.5465 -710.02
## 
## Step:  AIC=-823.39
## log(price) ~ horsepower + enginesize + curbweight + highwaympg + 
##     aspiration + carbody + cylinderNum + carManufacturer
## 
##                   Df Sum of Sq    RSS     AIC
## - highwaympg       1   0.00642 2.6312 -824.89
## <none>                         2.6248 -823.39
## - enginesize       1   0.04331 2.6681 -822.04
## - aspiration       1   0.08273 2.7075 -819.03
## - horsepower       1   0.10214 2.7269 -817.57
## - cylinderNum      4   0.21732 2.8421 -815.09
## - carbody          4   0.35725 2.9820 -805.23
## - curbweight       1   0.78807 3.4128 -771.57
## - carManufacturer 21   3.07072 5.6955 -706.58
## 
## Step:  AIC=-824.89
## log(price) ~ horsepower + enginesize + curbweight + aspiration + 
##     carbody + cylinderNum + carManufacturer
## 
##                   Df Sum of Sq    RSS     AIC
## <none>                         2.6312 -824.89
## - enginesize       1   0.04007 2.6713 -823.79
## - aspiration       1   0.07675 2.7079 -821.00
## - cylinderNum      4   0.23362 2.8648 -815.45
## - horsepower       1   0.17210 2.8033 -813.90
## - carbody          4   0.36112 2.9923 -806.53
## - curbweight       1   0.90130 3.5325 -766.51
## - carManufacturer 21   3.06450 5.6957 -708.58
summary(mdl.pa4)
## 
## Call:
## lm(formula = log(price) ~ horsepower + enginesize + curbweight + 
##     aspiration + carbody + cylinderNum + carManufacturer, data = df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.35131 -0.06436  0.00549  0.06560  0.41121 
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                7.785e+00  1.636e-01  47.580  < 2e-16 ***
## horsepower                 2.234e-03  6.681e-04   3.344 0.001013 ** 
## enginesize                 1.447e-03  8.969e-04   1.614 0.108415    
## curbweight                 5.068e-04  6.622e-05   7.653 1.37e-12 ***
## aspirationturbo            7.176e-02  3.213e-02   2.233 0.026821 *  
## carbodyhardtop            -1.666e-01  7.531e-02  -2.213 0.028256 *  
## carbodyhatchback          -2.359e-01  6.357e-02  -3.710 0.000280 ***
## carbodysedan              -1.730e-01  6.330e-02  -2.734 0.006922 ** 
## carbodywagon              -2.583e-01  6.863e-02  -3.763 0.000230 ***
## cylinderNumfour            1.188e-01  8.188e-02   1.451 0.148706    
## cylinderNumgeq_eight      -6.697e-02  1.010e-01  -0.663 0.508069    
## cylinderNumleq_three       3.681e-01  1.092e-01   3.370 0.000928 ***
## cylinderNumsix             1.245e-01  8.534e-02   1.458 0.146543    
## carManufactureraudi        2.699e-01  1.142e-01   2.364 0.019184 *  
## carManufacturerbmw         3.455e-01  9.617e-02   3.592 0.000428 ***
## carManufacturerbuick       2.512e-01  1.316e-01   1.909 0.058000 .  
## carManufacturerchevrolet  -2.239e-01  1.160e-01  -1.931 0.055195 .  
## carManufacturerdodge      -2.013e-01  9.465e-02  -2.127 0.034895 *  
## carManufacturerhonda      -9.350e-02  9.116e-02  -1.026 0.306487    
## carManufacturerisuzu      -7.672e-02  1.051e-01  -0.730 0.466217    
## carManufacturerjaguar     -1.273e-01  1.302e-01  -0.978 0.329502    
## carManufacturermazda      -4.852e-02  9.119e-02  -0.532 0.595326    
## carManufacturermercury    -9.729e-02  1.538e-01  -0.633 0.527732    
## carManufacturermitsubishi -2.475e-01  9.162e-02  -2.702 0.007593 ** 
## carManufacturernissan     -1.631e-01  8.862e-02  -1.841 0.067378 .  
## carManufacturerpeugeot    -1.536e-01  1.032e-01  -1.488 0.138565    
## carManufacturerplymouth   -2.281e-01  9.692e-02  -2.353 0.019755 *  
## carManufacturerporsche     4.238e-01  1.026e-01   4.130 5.65e-05 ***
## carManufacturerrenault    -1.577e-01  1.237e-01  -1.275 0.204105    
## carManufacturersaab        4.221e-02  9.970e-02   0.423 0.672556    
## carManufacturersubaru     -1.925e-01  9.212e-02  -2.090 0.038133 *  
## carManufacturertoyota     -1.648e-01  8.551e-02  -1.927 0.055623 .  
## carManufacturervolkswagen -6.172e-02  9.032e-02  -0.683 0.495316    
## carManufacturervolvo       1.254e-02  9.489e-02   0.132 0.894990    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.124 on 171 degrees of freedom
## Multiple R-squared:  0.9492, Adjusted R-squared:  0.9394 
## F-statistic: 96.79 on 33 and 171 DF,  p-value: < 2.2e-16
par(bg = "white")
par(mfrow = c(2, 2))
plot(mdl.pa4)

par(mfrow = c(1, 1))
print(vif(mdl.pa4))
##                       GVIF Df GVIF^(1/(2*Df))
## horsepower        9.254549  1        3.042129
## enginesize       18.493058  1        4.300356
## curbweight       15.763028  1        3.970268
## aspiration        2.034585  1        1.426389
## carbody           4.036524  4        1.190559
## cylinderNum      71.722557  4        1.705913
## carManufacturer 383.938806 21        1.152206
print(anova(mdl.pa3, mdl.pa4))
## Analysis of Variance Table
## 
## Model 1: log(price) ~ horsepower + enginesize + curbweight + highwaympg + 
##     fueltype + aspiration + carbody + drivewheel + cylinderNum + 
##     carManufacturer
## Model 2: log(price) ~ horsepower + enginesize + curbweight + aspiration + 
##     carbody + cylinderNum + carManufacturer
##   Res.Df    RSS Df Sum of Sq      F Pr(>F)
## 1    167 2.6011                           
## 2    171 2.6312 -4 -0.030097 0.4831 0.7481

2.2.1 Summary

  • mdl.pa4 now shows a random, horizontal band of points. The red line is flat. This indicates the variance of the residuals is now constant, and we can trust our p-values and standard errors.
  • GVIF values are high, but we have lots of categories. Importantly all GVIF^(1/(2*Df)) are below 5 -> no collinearity in model
  • Anova test with two models fails to reject null hypothesis (\(H_0\)​) that all the variables you removed (highwaympg, fueltype, and drivewheel) have coefficients equal to zero and add no significant predictive value to the model.
    • p-value (0.7481) is large -> there is no statistical evidence that highwaympg, fueltype, and drivewheel as a group improve the model.

mdl.pa4 (simpiler) is just as good as mdl.pa3, so we should use mdl.pa4.

2.3 Demo: testing this model’s use to answer our initial question

BE note: we know from data source that these prices are in dollars…

# Example properties of new car
# BE note: have to use exact text for factor vals
# Also a limitation of this (not built in) is that it doesn't check for limits
# of data (i.e., if we put in a horsepower over the max value the model won't
# be valid, but this isn't accounted for here...)
# this also doesn't account for types of engine produced by companies
new_car <- data.frame(
  horsepower = 100,
  enginesize = 120,
  curbweight = 2500,
  aspiration = as.factor("std"),
  carbody = as.factor("sedan"),
  cylinderNum = as.factor("four"),
  carManufacturer = as.factor("toyota") # peugeot/volvo/porsche
)

# use predict() to estimate fit
pred_log_scale <-
  predict(mdl.pa4, newdata = new_car, interval = "prediction")

print("Prediction on log($) scale:")
## [1] "Prediction on log($) scale:"
print(pred_log_scale)
##        fit      lwr      upr
## 1 9.230509 8.979491 9.481526
# convert back from log price for fit, lwr, upr
pred_dollar_scale <- exp(pred_log_scale)

print("Prediction on dollar ($) scale:")
## [1] "Prediction on dollar ($) scale:"
print(pred_dollar_scale)
##        fit      lwr      upr
## 1 10203.73 7938.589 13115.19
# All in one...
car_props <- paste(
  new_car$carManufacturer, new_car$carbody,
  "with", new_car$horsepower, "hp,",
  new_car$enginesize, "enginesize, and",
  new_car$curbweight, "lb curbweight"
)

fit_price <- pred_dollar_scale[1, "fit"]
lwr_price <- pred_dollar_scale[1, "lwr"]
upr_price <- pred_dollar_scale[1, "upr"]

report_string <- sprintf(
  "\nBased on our final model, for a %s:
  \n> The single best price estimate (yhat) is $%.0f.
  \n> We can say we are 95%% confident that the listing price for a car with
  these specific specifications would fall within prediction interval of
  [$%.0f, $%.0f].",
  car_props,
  fit_price,
  lwr_price,
  upr_price
)
cat(report_string)
## 
## Based on our final model, for a toyota sedan with 100 hp, 120 enginesize, and 2500 lb curbweight:
##   
## > The single best price estimate (yhat) is $10204.
##   
## > We can say we are 95% confident that the listing price for a car with
##   these specific specifications would fall within prediction interval of
##   [$7939, $13115].

2.3.1 Demo: testing function

get_price_prediction_report <- function(model,
                                        hp,
                                        engine_size,
                                        curb_weight,
                                        asp,
                                        body,
                                        cyl,
                                        mfr) {
  new_car <- data.frame(
    horsepower = hp,
    enginesize = engine_size,
    curbweight = curb_weight,
    aspiration = as.factor(asp),
    carbody = as.factor(body),
    cylinderNum = as.factor(cyl),
    carManufacturer = as.factor(mfr)
  )
  pred_log_scale <- predict(model, newdata = new_car, interval = "prediction")
  pred_dollar_scale <- exp(pred_log_scale)
  car_props <- paste(
    new_car$carManufacturer, new_car$carbody,
    "with", new_car$horsepower, "hp,",
    new_car$enginesize, "enginesize, and",
    new_car$curbweight, "lb curbweight"
  )
  fit_price <- pred_dollar_scale[1, "fit"]
  lwr_price <- pred_dollar_scale[1, "lwr"]
  upr_price <- pred_dollar_scale[1, "upr"]
  report_string <- sprintf(
    "\nBased on our final model, for a %s:
    > The single best price estimate (y-hat) is $%.0f
    > 95%% CI for this specific car is [$%.0f, $%.0f]",
    car_props,
    fit_price,
    lwr_price,
    upr_price
  )

  cat(report_string)
}
# test 01: same toyota (to check :))
get_price_prediction_report(
  model = mdl.pa4,
  hp = 100,
  engine_size = 120,
  curb_weight = 2500,
  asp = "std",
  body = "sedan",
  cyl = "four",
  mfr = "toyota"
)
## 
## Based on our final model, for a toyota sedan with 100 hp, 120 enginesize, and 2500 lb curbweight:
##     > The single best price estimate (y-hat) is $10204
##     > 95% CI for this specific car is [$7939, $13115]
# test 02: porsche
get_price_prediction_report(
  model = mdl.pa4,
  hp = 200,
  engine_size = 180,
  curb_weight = 3000,
  asp = "turbo",
  body = "hardtop",
  cyl = "six",
  mfr = "porsche"
)
## 
## Based on our final model, for a porsche hardtop with 200 hp, 180 enginesize, and 3000 lb curbweight:
##     > The single best price estimate (y-hat) is $35127
##     > 95% CI for this specific car is [$26391, $46756]
# test 03: honda
get_price_prediction_report(
  model = mdl.pa4,
  hp = 150,
  engine_size = 120,
  curb_weight = 1000,
  asp = "std",
  body = "convertible",
  cyl = "five",
  mfr = "honda"
)
## 
## Based on our final model, for a honda convertible with 150 hp, 120 enginesize, and 1000 lb curbweight:
##     > The single best price estimate (y-hat) is $6048
##     > 95% CI for this specific car is [$4075, $8977]

2.4 Limitations

Observation 76 is a high-leverage point

  • coefficients are being skewed by this single observation
  • /model is overfit to this point

2.5 Suggestions for Improvements

  • We could remove this point and refit the model to see if the coefficients are stable and the model is more general
  • BE note: don’t know if we want to do this in the model or leave it and discuss here?

2.5.1 Attempt 03 (may not include in final version)

row_to_remove <- 76

df_fixed <- df[-row_to_remove, ]

mdl.pa5 <- lm(formula(mdl.pa4), data = df_fixed)

print(summary(mdl.pa4))
## 
## Call:
## lm(formula = log(price) ~ horsepower + enginesize + curbweight + 
##     aspiration + carbody + cylinderNum + carManufacturer, data = df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.35131 -0.06436  0.00549  0.06560  0.41121 
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                7.785e+00  1.636e-01  47.580  < 2e-16 ***
## horsepower                 2.234e-03  6.681e-04   3.344 0.001013 ** 
## enginesize                 1.447e-03  8.969e-04   1.614 0.108415    
## curbweight                 5.068e-04  6.622e-05   7.653 1.37e-12 ***
## aspirationturbo            7.176e-02  3.213e-02   2.233 0.026821 *  
## carbodyhardtop            -1.666e-01  7.531e-02  -2.213 0.028256 *  
## carbodyhatchback          -2.359e-01  6.357e-02  -3.710 0.000280 ***
## carbodysedan              -1.730e-01  6.330e-02  -2.734 0.006922 ** 
## carbodywagon              -2.583e-01  6.863e-02  -3.763 0.000230 ***
## cylinderNumfour            1.188e-01  8.188e-02   1.451 0.148706    
## cylinderNumgeq_eight      -6.697e-02  1.010e-01  -0.663 0.508069    
## cylinderNumleq_three       3.681e-01  1.092e-01   3.370 0.000928 ***
## cylinderNumsix             1.245e-01  8.534e-02   1.458 0.146543    
## carManufactureraudi        2.699e-01  1.142e-01   2.364 0.019184 *  
## carManufacturerbmw         3.455e-01  9.617e-02   3.592 0.000428 ***
## carManufacturerbuick       2.512e-01  1.316e-01   1.909 0.058000 .  
## carManufacturerchevrolet  -2.239e-01  1.160e-01  -1.931 0.055195 .  
## carManufacturerdodge      -2.013e-01  9.465e-02  -2.127 0.034895 *  
## carManufacturerhonda      -9.350e-02  9.116e-02  -1.026 0.306487    
## carManufacturerisuzu      -7.672e-02  1.051e-01  -0.730 0.466217    
## carManufacturerjaguar     -1.273e-01  1.302e-01  -0.978 0.329502    
## carManufacturermazda      -4.852e-02  9.119e-02  -0.532 0.595326    
## carManufacturermercury    -9.729e-02  1.538e-01  -0.633 0.527732    
## carManufacturermitsubishi -2.475e-01  9.162e-02  -2.702 0.007593 ** 
## carManufacturernissan     -1.631e-01  8.862e-02  -1.841 0.067378 .  
## carManufacturerpeugeot    -1.536e-01  1.032e-01  -1.488 0.138565    
## carManufacturerplymouth   -2.281e-01  9.692e-02  -2.353 0.019755 *  
## carManufacturerporsche     4.238e-01  1.026e-01   4.130 5.65e-05 ***
## carManufacturerrenault    -1.577e-01  1.237e-01  -1.275 0.204105    
## carManufacturersaab        4.221e-02  9.970e-02   0.423 0.672556    
## carManufacturersubaru     -1.925e-01  9.212e-02  -2.090 0.038133 *  
## carManufacturertoyota     -1.648e-01  8.551e-02  -1.927 0.055623 .  
## carManufacturervolkswagen -6.172e-02  9.032e-02  -0.683 0.495316    
## carManufacturervolvo       1.254e-02  9.489e-02   0.132 0.894990    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.124 on 171 degrees of freedom
## Multiple R-squared:  0.9492, Adjusted R-squared:  0.9394 
## F-statistic: 96.79 on 33 and 171 DF,  p-value: < 2.2e-16
print(summary(mdl.pa5))
## 
## Call:
## lm(formula = formula(mdl.pa4), data = df_fixed)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.35131 -0.06456  0.00564  0.06565  0.41121 
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                7.785e+00  1.636e-01  47.580  < 2e-16 ***
## horsepower                 2.234e-03  6.681e-04   3.344 0.001013 ** 
## enginesize                 1.447e-03  8.969e-04   1.614 0.108415    
## curbweight                 5.068e-04  6.622e-05   7.653 1.37e-12 ***
## aspirationturbo            7.176e-02  3.213e-02   2.233 0.026821 *  
## carbodyhardtop            -1.666e-01  7.531e-02  -2.213 0.028256 *  
## carbodyhatchback          -2.359e-01  6.357e-02  -3.710 0.000280 ***
## carbodysedan              -1.730e-01  6.330e-02  -2.734 0.006922 ** 
## carbodywagon              -2.583e-01  6.863e-02  -3.763 0.000230 ***
## cylinderNumfour            1.188e-01  8.188e-02   1.451 0.148706    
## cylinderNumgeq_eight      -6.697e-02  1.010e-01  -0.663 0.508069    
## cylinderNumleq_three       3.681e-01  1.092e-01   3.370 0.000928 ***
## cylinderNumsix             1.245e-01  8.534e-02   1.458 0.146543    
## carManufactureraudi        2.699e-01  1.142e-01   2.364 0.019184 *  
## carManufacturerbmw         3.455e-01  9.617e-02   3.592 0.000428 ***
## carManufacturerbuick       2.512e-01  1.316e-01   1.909 0.058000 .  
## carManufacturerchevrolet  -2.239e-01  1.160e-01  -1.931 0.055195 .  
## carManufacturerdodge      -2.013e-01  9.465e-02  -2.127 0.034895 *  
## carManufacturerhonda      -9.350e-02  9.116e-02  -1.026 0.306487    
## carManufacturerisuzu      -7.672e-02  1.051e-01  -0.730 0.466217    
## carManufacturerjaguar     -1.273e-01  1.302e-01  -0.978 0.329502    
## carManufacturermazda      -4.852e-02  9.119e-02  -0.532 0.595326    
## carManufacturermitsubishi -2.475e-01  9.162e-02  -2.702 0.007593 ** 
## carManufacturernissan     -1.631e-01  8.862e-02  -1.841 0.067378 .  
## carManufacturerpeugeot    -1.536e-01  1.032e-01  -1.488 0.138565    
## carManufacturerplymouth   -2.281e-01  9.692e-02  -2.353 0.019755 *  
## carManufacturerporsche     4.238e-01  1.026e-01   4.130 5.65e-05 ***
## carManufacturerrenault    -1.577e-01  1.237e-01  -1.275 0.204105    
## carManufacturersaab        4.221e-02  9.970e-02   0.423 0.672556    
## carManufacturersubaru     -1.925e-01  9.212e-02  -2.090 0.038133 *  
## carManufacturertoyota     -1.648e-01  8.551e-02  -1.927 0.055623 .  
## carManufacturervolkswagen -6.172e-02  9.032e-02  -0.683 0.495316    
## carManufacturervolvo       1.254e-02  9.489e-02   0.132 0.894990    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.124 on 171 degrees of freedom
## Multiple R-squared:  0.9491, Adjusted R-squared:  0.9395 
## F-statistic: 99.56 on 32 and 171 DF,  p-value: < 2.2e-16
par(bg = "white")
par(mfrow = c(2, 2))
plot(mdl.pa5)

par(mfrow = c(1, 1))

cat("\n--- VIF Scores for Robust Final Model ---\n")
## 
## --- VIF Scores for Robust Final Model ---
print(car::vif(mdl.pa5))
##                       GVIF Df GVIF^(1/(2*Df))
## horsepower        9.108072  1        3.017958
## enginesize       18.484053  1        4.299308
## curbweight       15.727048  1        3.965734
## aspiration        1.989300  1        1.410425
## carbody           3.998363  4        1.189146
## cylinderNum      71.620842  4        1.705611
## carManufacturer 359.645296 20        1.158502

3 MPG Model

3.1 Attempt 01

Here we are asking: “How does fuel economy (citympg) affect a car’s price, after controlling for other characteristics?”

vars <- c(
  "price", "safetyIncr", "aspiration", "carbody", "enginelocation",
  "carwidth", "curbweight", "enginetype", "cylinderNum", "enginesize",
  "stroke", "peakrpm", "compressionratio", "carManufacturer", "citympg"
)

dfm <- df[, vars]
dfm <- dfm[complete.cases(dfm), ]
dfm[] <- lapply(dfm, function(x) if (is.factor(x)) droplevels(x) else x)

# BE note: mpg1 is not included here, but have kept naming the same for
# consistency
lm.mpg2 <- lm(
  price ~ safetyIncr + aspiration + carbody +
    enginelocation + carwidth + curbweight + enginetype +
    cylinderNum + enginesize + stroke + peakrpm + compressionratio +
    carManufacturer + citympg,
  data = dfm
)
print(summary(lm.mpg2))
## 
## Call:
## lm(formula = price ~ safetyIncr + aspiration + carbody + enginelocation + 
##     carwidth + curbweight + enginetype + cylinderNum + enginesize + 
##     stroke + peakrpm + compressionratio + carManufacturer + citympg, 
##     data = dfm)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3625.7  -913.2     0.0   877.3  8457.7 
## 
## Coefficients: (2 not defined because of singularities)
##                             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)               -5.859e+04  1.172e+04  -4.998 1.52e-06 ***
## safetyIncr1                5.175e+02  7.122e+02   0.727 0.468546    
## safetyIncr2                1.436e+03  7.279e+02   1.973 0.050274 .  
## safetyIncr3                1.408e+03  7.726e+02   1.823 0.070266 .  
## safetyIncr4                9.524e+02  9.365e+02   1.017 0.310726    
## aspirationturbo            2.132e+03  5.085e+02   4.193 4.57e-05 ***
## carbodyhardtop            -2.688e+03  1.196e+03  -2.248 0.025978 *  
## carbodyhatchback          -2.997e+03  1.047e+03  -2.861 0.004790 ** 
## carbodysedan              -2.915e+03  1.094e+03  -2.665 0.008502 ** 
## carbodywagon              -3.776e+03  1.149e+03  -3.287 0.001247 ** 
## enginelocationrear         1.189e+04  3.126e+03   3.803 0.000204 ***
## carwidth                   6.679e+02  1.947e+02   3.431 0.000767 ***
## curbweight                 4.573e+00  1.350e+00   3.388 0.000888 ***
## enginetypedohcv           -3.609e+03  3.558e+03  -1.014 0.312029    
## enginetypel               -4.170e+03  1.733e+03  -2.406 0.017263 *  
## enginetypeohc              1.352e+03  1.032e+03   1.310 0.192061    
## enginetypeohcf            -3.332e+03  1.589e+03  -2.097 0.037608 *  
## enginetypeohcv            -3.126e+03  1.188e+03  -2.632 0.009325 ** 
## enginetyperotor           -3.855e+03  3.371e+03  -1.144 0.254517    
## cylinderNumfour            3.030e+03  1.373e+03   2.207 0.028763 *  
## cylinderNumgeq_eight       6.613e+03  2.208e+03   2.996 0.003181 ** 
## cylinderNumleq_three       1.382e+04  3.516e+03   3.931 0.000126 ***
## cylinderNumsix             5.441e+03  1.586e+03   3.431 0.000768 ***
## enginesize                 7.207e+01  1.670e+01   4.316 2.79e-05 ***
## stroke                    -1.416e+03  9.024e+02  -1.569 0.118617    
## peakrpm                    1.865e+00  5.866e-01   3.180 0.001773 ** 
## compressionratio          -6.940e+01  6.173e+01  -1.124 0.262612    
## carManufactureraudi        1.051e+03  2.144e+03   0.490 0.624764    
## carManufacturerbmw         3.851e+03  1.955e+03   1.969 0.050654 .  
## carManufacturerbuick       5.182e+03  2.130e+03   2.433 0.016103 *  
## carManufacturerchevrolet  -4.532e+03  2.100e+03  -2.157 0.032480 *  
## carManufacturerdodge      -4.590e+03  1.693e+03  -2.712 0.007438 ** 
## carManufacturerhonda      -4.072e+03  1.787e+03  -2.278 0.024059 *  
## carManufacturerisuzu      -2.902e+03  1.791e+03  -1.620 0.107121    
## carManufacturerjaguar     -1.256e+02  2.199e+03  -0.057 0.954545    
## carManufacturermazda      -3.229e+03  1.545e+03  -2.090 0.038204 *  
## carManufacturermercury    -5.075e+03  2.488e+03  -2.039 0.043074 *  
## carManufacturermitsubishi -5.207e+03  1.686e+03  -3.088 0.002383 ** 
## carManufacturernissan     -3.816e+03  1.521e+03  -2.508 0.013141 *  
## carManufacturerpeugeot            NA         NA      NA       NA    
## carManufacturerplymouth   -4.899e+03  1.724e+03  -2.841 0.005087 ** 
## carManufacturerporsche     2.776e+03  2.532e+03   1.096 0.274553    
## carManufacturerrenault    -5.121e+03  2.146e+03  -2.386 0.018230 *  
## carManufacturersaab       -1.211e+03  1.704e+03  -0.711 0.478210    
## carManufacturersubaru             NA         NA      NA       NA    
## carManufacturertoyota     -4.033e+03  1.437e+03  -2.806 0.005652 ** 
## carManufacturervolkswagen -2.923e+03  1.682e+03  -1.738 0.084173 .  
## carManufacturervolvo      -3.055e+03  1.862e+03  -1.641 0.102835    
## citympg                    1.076e+02  6.651e+01   1.618 0.107634    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1816 on 158 degrees of freedom
## Multiple R-squared:   0.96,  Adjusted R-squared:  0.9483 
## F-statistic: 82.35 on 46 and 158 DF,  p-value: < 2.2e-16
par(bg = "white")
par(mfrow = c(2, 2))
plot(lm.mpg2)

par(mfrow = c(1, 1))

# print(car::vif(lm.mpg2))

BE note: vif raises an error: Error in vif.default(lm.mpg2): there are aliased coefficients in the model

This (& plots) indicate:

  • Perfect collinearity - model is over specified
  • Heteroscedasticity from residuals vs fitted (general & 67, 17 and 75)
  • Leverage points (50)

Model gives p-value of 0.107634, which suggests effect of citympg is not significant, but errors stated above indicate/suggest model is not valid

3.1.1 Attempting to address these issues

# use log(price)
lm.mpg2log <- lm(
  log(price) ~ safetyIncr + aspiration + carbody +
    enginelocation + carwidth + curbweight + enginetype +
    cylinderNum + enginesize + stroke + peakrpm + compressionratio +
    carManufacturer + citympg,
  data = dfm
)
summary(lm.mpg2log)
## 
## Call:
## lm(formula = log(price) ~ safetyIncr + aspiration + carbody + 
##     enginelocation + carwidth + curbweight + enginetype + cylinderNum + 
##     enginesize + stroke + peakrpm + compressionratio + carManufacturer + 
##     citympg, data = dfm)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.28965 -0.06365  0.00171  0.06179  0.43451 
## 
## Coefficients: (2 not defined because of singularities)
##                             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                5.325e+00  7.684e-01   6.930 1.01e-10 ***
## safetyIncr1               -6.262e-02  4.669e-02  -1.341 0.181786    
## safetyIncr2               -5.709e-02  4.771e-02  -1.197 0.233273    
## safetyIncr3               -2.135e-02  5.064e-02  -0.422 0.673873    
## safetyIncr4                1.902e-03  6.139e-02   0.031 0.975318    
## aspirationturbo            1.202e-01  3.333e-02   3.605 0.000418 ***
## carbodyhardtop            -1.210e-01  7.840e-02  -1.544 0.124627    
## carbodyhatchback          -1.864e-01  6.865e-02  -2.715 0.007362 ** 
## carbodysedan              -1.423e-01  7.170e-02  -1.984 0.048980 *  
## carbodywagon              -2.277e-01  7.530e-02  -3.024 0.002912 ** 
## enginelocationrear         6.313e-01  2.049e-01   3.080 0.002440 ** 
## carwidth                   4.114e-02  1.276e-02   3.224 0.001537 ** 
## curbweight                 4.517e-04  8.847e-05   5.105 9.38e-07 ***
## enginetypedohcv           -1.778e-01  2.333e-01  -0.762 0.447128    
## enginetypel               -3.669e-01  1.136e-01  -3.230 0.001507 ** 
## enginetypeohc             -2.060e-02  6.765e-02  -0.304 0.761155    
## enginetypeohcf            -3.078e-01  1.042e-01  -2.954 0.003614 ** 
## enginetypeohcv            -8.030e-02  7.784e-02  -1.032 0.303855    
## enginetyperotor           -4.276e-01  2.210e-01  -1.935 0.054761 .  
## cylinderNumfour            1.181e-01  9.000e-02   1.312 0.191259    
## cylinderNumgeq_eight       1.301e-01  1.447e-01   0.899 0.370010    
## cylinderNumleq_three       7.171e-01  2.305e-01   3.111 0.002212 ** 
## cylinderNumsix             1.295e-01  1.040e-01   1.246 0.214712    
## enginesize                 1.932e-03  1.095e-03   1.765 0.079549 .  
## stroke                    -3.219e-02  5.915e-02  -0.544 0.587038    
## peakrpm                    7.092e-05  3.846e-05   1.844 0.067016 .  
## compressionratio           1.544e-03  4.046e-03   0.381 0.703370    
## carManufactureraudi        7.558e-02  1.406e-01   0.538 0.591579    
## carManufacturerbmw         2.871e-01  1.282e-01   2.240 0.026486 *  
## carManufacturerbuick      -3.798e-02  1.396e-01  -0.272 0.786005    
## carManufacturerchevrolet  -2.375e-01  1.377e-01  -1.725 0.086454 .  
## carManufacturerdodge      -2.841e-01  1.110e-01  -2.561 0.011388 *  
## carManufacturerhonda      -2.033e-01  1.172e-01  -1.735 0.084623 .  
## carManufacturerisuzu      -1.114e-01  1.174e-01  -0.949 0.344305    
## carManufacturerjaguar     -2.321e-01  1.442e-01  -1.610 0.109381    
## carManufacturermazda      -1.627e-01  1.013e-01  -1.607 0.110028    
## carManufacturermercury    -1.597e-01  1.631e-01  -0.979 0.328936    
## carManufacturermitsubishi -3.505e-01  1.105e-01  -3.170 0.001828 ** 
## carManufacturernissan     -2.060e-01  9.972e-02  -2.066 0.040505 *  
## carManufacturerpeugeot            NA         NA      NA       NA    
## carManufacturerplymouth   -3.109e-01  1.130e-01  -2.750 0.006647 ** 
## carManufacturerporsche     1.796e-01  1.660e-01   1.082 0.280897    
## carManufacturerrenault    -2.921e-01  1.407e-01  -2.076 0.039537 *  
## carManufacturersaab       -8.446e-02  1.117e-01  -0.756 0.450590    
## carManufacturersubaru             NA         NA      NA       NA    
## carManufacturertoyota     -2.357e-01  9.422e-02  -2.501 0.013398 *  
## carManufacturervolkswagen -1.781e-01  1.102e-01  -1.616 0.108162    
## carManufacturervolvo      -1.712e-01  1.220e-01  -1.403 0.162601    
## citympg                   -4.080e-03  4.360e-03  -0.936 0.350801    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1191 on 158 degrees of freedom
## Multiple R-squared:  0.9567, Adjusted R-squared:  0.9441 
## F-statistic: 75.96 on 46 and 158 DF,  p-value: < 2.2e-16
lm.mpg2logaic <- stepAIC(lm.mpg2log, direction = "backward", trace = FALSE)
print(summary(lm.mpg2logaic))
## 
## Call:
## lm(formula = log(price) ~ aspiration + carbody + carwidth + curbweight + 
##     enginetype + enginesize + peakrpm + carManufacturer + citympg, 
##     data = dfm)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.28430 -0.06153  0.00325  0.06208  0.44761 
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                5.388e+00  7.113e-01   7.575 2.33e-12 ***
## aspirationturbo            1.194e-01  2.940e-02   4.060 7.53e-05 ***
## carbodyhardtop            -1.748e-01  7.132e-02  -2.451 0.015277 *  
## carbodyhatchback          -2.149e-01  6.319e-02  -3.400 0.000842 ***
## carbodysedan              -1.650e-01  6.231e-02  -2.648 0.008870 ** 
## carbodywagon              -2.402e-01  6.640e-02  -3.618 0.000393 ***
## carwidth                   4.182e-02  1.181e-02   3.542 0.000516 ***
## curbweight                 4.432e-04  7.213e-05   6.144 5.71e-09 ***
## enginetypedohcv           -2.518e-01  1.764e-01  -1.428 0.155191    
## enginetypel                1.857e-01  1.613e-01   1.151 0.251343    
## enginetypeohc             -4.827e-02  5.443e-02  -0.887 0.376485    
## enginetypeohcf             3.230e-01  1.581e-01   2.043 0.042610 *  
## enginetypeohcv            -6.121e-02  6.296e-02  -0.972 0.332350    
## enginetyperotor            2.113e-01  9.566e-02   2.209 0.028509 *  
## enginesize                 2.087e-03  8.213e-04   2.541 0.011970 *  
## peakrpm                    5.520e-05  3.476e-05   1.588 0.114183    
## carManufactureraudi       -2.622e-02  1.142e-01  -0.230 0.818649    
## carManufacturerbmw         3.169e-01  1.103e-01   2.872 0.004612 ** 
## carManufacturerbuick      -8.363e-02  1.117e-01  -0.749 0.454916    
## carManufacturerchevrolet  -2.064e-01  1.292e-01  -1.597 0.112112    
## carManufacturerdodge      -2.694e-01  1.024e-01  -2.630 0.009325 ** 
## carManufacturerhonda      -1.795e-01  1.051e-01  -1.709 0.089367 .  
## carManufacturerisuzu      -8.925e-02  1.101e-01  -0.810 0.418946    
## carManufacturerjaguar     -2.546e-01  1.355e-01  -1.878 0.062098 .  
## carManufacturermazda      -1.488e-01  9.609e-02  -1.548 0.123488    
## carManufacturermercury    -1.556e-01  1.552e-01  -1.002 0.317595    
## carManufacturermitsubishi -3.298e-01  1.005e-01  -3.281 0.001261 ** 
## carManufacturernissan     -2.030e-01  9.345e-02  -2.172 0.031255 *  
## carManufacturerpeugeot    -5.441e-01  1.891e-01  -2.878 0.004532 ** 
## carManufacturerplymouth   -2.935e-01  1.046e-01  -2.805 0.005621 ** 
## carManufacturerporsche     2.475e-01  1.591e-01   1.556 0.121594    
## carManufacturerrenault    -3.048e-01  1.275e-01  -2.390 0.017981 *  
## carManufacturersaab       -5.038e-02  1.014e-01  -0.497 0.619957    
## carManufacturersubaru     -6.205e-01  1.849e-01  -3.356 0.000978 ***
## carManufacturertoyota     -2.127e-01  8.774e-02  -2.424 0.016412 *  
## carManufacturervolkswagen -1.725e-01  9.773e-02  -1.765 0.079457 .  
## carManufacturervolvo      -1.111e-01  1.077e-01  -1.032 0.303519    
## citympg                   -4.162e-03  3.027e-03  -1.375 0.171028    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1184 on 167 degrees of freedom
## Multiple R-squared:  0.9548, Adjusted R-squared:  0.9448 
## F-statistic: 95.36 on 37 and 167 DF,  p-value: < 2.2e-16
par(bg = "white")
par(mfrow = c(2, 2))
plot(lm.mpg2logaic)

par(mfrow = c(1, 1))
print(car::vif(lm.mpg2logaic))
##                         GVIF Df GVIF^(1/(2*Df))
## aspiration      1.870544e+00  1        1.367678
## carbody         4.724866e+00  4        1.214224
## carwidth        9.343089e+00  1        3.056647
## curbweight      2.053810e+01  1        4.531898
## enginetype      7.457299e+03  6        2.102399
## enginesize      1.702941e+01  1        4.126670
## peakrpm         4.001969e+00  1        2.000492
## carManufacturer 1.198908e+05 21        1.321063
## citympg         5.710770e+00  1        2.389722
print(anova(lm.mpg2logaic, lm.mpg2log))
## Analysis of Variance Table
## 
## Model 1: log(price) ~ aspiration + carbody + carwidth + curbweight + enginetype + 
##     enginesize + peakrpm + carManufacturer + citympg
## Model 2: log(price) ~ safetyIncr + aspiration + carbody + enginelocation + 
##     carwidth + curbweight + enginetype + cylinderNum + enginesize + 
##     stroke + peakrpm + compressionratio + carManufacturer + citympg
##   Res.Df  RSS Df Sum of Sq      F Pr(>F)
## 1    167 2.34                           
## 2    158 2.24  9  0.099951 0.7833  0.632

BE notes:

  • Heteroscedasticity is addressed (residuals vs fitted)
  • Adjusted r-squared is high (0.9448), model explains 94% of variance in the logarithm of the price
  • All VIF values are below threshold of 5 (although curbweight is slightly high)
  • Anova shows simplier model is sufficient -> this is the one we should use

Linking back to original question: “How does fuel economy (citympg) affect a car’s price, after controlling for other characteristics?”

Linking back to original question: “How does fuel economy (citympg) affect a car’s price, after controlling for other characteristics?”

Coefficients

Term Estimate Std. Error t value Pr(> t )
citympg -4.162e-03 3.027e-03 -1.375 0.171028

Answer:

  • Estimate: \(-4.163\times10^{-3}\)
  • p-value: \(0.171\)

So citympg does not have a statistically significant effect on log(price) after controlling for other factors (like curbweight, enginesize, carwidth, and carManufacturer)

Answer:

  • Estimate: \(-4.163\times10^{-3}\)
  • p-value: \(0.171\)

So citympg does not have a statistically significant effect on log(price) after controlling for other factors (like curbweight, enginesize, carwidth, and carManufacturer)

3.2 Attempt 02

Note here Ardi’s great model looks to address the question: ‘What car features predict a car’s fuel efficiency value?’

BE note: uses log(highwaympg/price) which represents MPG per dollar (or fuel efficiency value)

lm.mpg3 <- lm(
  formula = log(highwaympg / price) ~ carbody + enginetype +
    carwidth + curbweight + peakrpm + horsepower + carManufacturer,
  data = df
)

summary(lm.mpg3)
## 
## Call:
## lm(formula = log(highwaympg/price) ~ carbody + enginetype + carwidth + 
##     curbweight + peakrpm + horsepower + carManufacturer, data = df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.34331 -0.08570  0.00625  0.08940  0.31795 
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)               -1.737e+00  8.655e-01  -2.007 0.046319 *  
## carbodyhardtop             2.046e-01  8.749e-02   2.339 0.020501 *  
## carbodyhatchback           2.633e-01  7.682e-02   3.428 0.000764 ***
## carbodysedan               2.191e-01  7.577e-02   2.892 0.004330 ** 
## carbodywagon               2.809e-01  8.026e-02   3.500 0.000594 ***
## enginetypedohcv            1.069e+00  2.271e-01   4.706 5.24e-06 ***
## enginetypel               -1.263e-01  1.978e-01  -0.638 0.524136    
## enginetypeohc             -7.980e-03  6.583e-02  -0.121 0.903655    
## enginetypeohcf            -9.385e-02  1.956e-01  -0.480 0.632014    
## enginetypeohcv             2.543e-02  7.151e-02   0.356 0.722621    
## enginetyperotor           -3.877e-01  1.024e-01  -3.785 0.000213 ***
## carwidth                  -2.666e-02  1.422e-02  -1.876 0.062432 .  
## curbweight                -7.130e-04  7.675e-05  -9.290  < 2e-16 ***
## peakrpm                   -7.619e-05  3.864e-05  -1.972 0.050277 .  
## horsepower                -5.866e-03  8.125e-04  -7.219 1.69e-11 ***
## carManufactureraudi       -1.322e-01  1.400e-01  -0.944 0.346274    
## carManufacturerbmw        -3.048e-01  1.291e-01  -2.362 0.019328 *  
## carManufacturerbuick      -1.186e-01  1.342e-01  -0.884 0.378130    
## carManufacturerchevrolet   3.625e-01  1.517e-01   2.390 0.017955 *  
## carManufacturerdodge       2.315e-01  1.219e-01   1.899 0.059286 .  
## carManufacturerhonda       1.909e-01  1.238e-01   1.543 0.124789    
## carManufacturerisuzu       1.151e-01  1.319e-01   0.873 0.383904    
## carManufacturerjaguar      2.716e-01  1.444e-01   1.882 0.061615 .  
## carManufacturermazda       1.028e-01  1.168e-01   0.880 0.380104    
## carManufacturermercury     2.441e-01  1.921e-01   1.271 0.205472    
## carManufacturermitsubishi  2.865e-01  1.199e-01   2.389 0.017989 *  
## carManufacturernissan      2.316e-01  1.117e-01   2.074 0.039618 *  
## carManufacturerpeugeot     3.178e-01  2.355e-01   1.350 0.178909    
## carManufacturerplymouth    2.727e-01  1.246e-01   2.189 0.029974 *  
## carManufacturerporsche    -1.621e-01  1.935e-01  -0.838 0.403178    
## carManufacturerrenault     2.260e-01  1.547e-01   1.461 0.145794    
## carManufacturersaab        7.343e-02  1.245e-01   0.590 0.556121    
## carManufacturersubaru      2.231e-01  2.294e-01   0.973 0.332175    
## carManufacturertoyota      2.229e-01  1.061e-01   2.101 0.037135 *  
## carManufacturervolkswagen  1.620e-01  1.181e-01   1.371 0.172220    
## carManufacturervolvo       9.926e-02  1.297e-01   0.765 0.445099    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1457 on 169 degrees of freedom
## Multiple R-squared:  0.9639, Adjusted R-squared:  0.9564 
## F-statistic:   129 on 35 and 169 DF,  p-value: < 2.2e-16
par(bg = "white")
par(mfrow = c(2, 2))
plot(lm.mpg3)

print(vif(lm.mpg3))
##                         GVIF Df GVIF^(1/(2*Df))
## carbody             4.260997  4        1.198640
## enginetype       4671.186671  6        2.022021
## carwidth            8.943383  1        2.990549
## curbweight         15.354936  1        3.918537
## peakrpm             3.266314  1        1.807295
## horsepower          9.926502  1        3.150635
## carManufacturer 50156.872654 21        1.293936

3.2.1 Problem

BE note: Is the fact that we have the message “not plotting observations with leverage one: 19, 76, 126, 130” a problem?

lm.mpg3aic <- stepAIC(lm.mpg3, direction = "backward", trace = TRUE)
## Start:  AIC=-757.45
## log(highwaympg/price) ~ carbody + enginetype + carwidth + curbweight + 
##     peakrpm + horsepower + carManufacturer
## 
##                   Df Sum of Sq    RSS     AIC
## <none>                         3.5855 -757.45
## - carwidth         1   0.07464 3.6601 -755.23
## - peakrpm          1   0.08248 3.6679 -754.79
## - carbody          4   0.32781 3.9133 -747.52
## - enginetype       6   1.16168 4.7471 -711.92
## - horsepower       1   1.10571 4.6912 -704.35
## - carManufacturer 21   2.81832 6.4038 -680.55
## - curbweight       1   1.83111 5.4166 -674.88
print(summary(lm.mpg3aic))
## 
## Call:
## lm(formula = log(highwaympg/price) ~ carbody + enginetype + carwidth + 
##     curbweight + peakrpm + horsepower + carManufacturer, data = df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.34331 -0.08570  0.00625  0.08940  0.31795 
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)               -1.737e+00  8.655e-01  -2.007 0.046319 *  
## carbodyhardtop             2.046e-01  8.749e-02   2.339 0.020501 *  
## carbodyhatchback           2.633e-01  7.682e-02   3.428 0.000764 ***
## carbodysedan               2.191e-01  7.577e-02   2.892 0.004330 ** 
## carbodywagon               2.809e-01  8.026e-02   3.500 0.000594 ***
## enginetypedohcv            1.069e+00  2.271e-01   4.706 5.24e-06 ***
## enginetypel               -1.263e-01  1.978e-01  -0.638 0.524136    
## enginetypeohc             -7.980e-03  6.583e-02  -0.121 0.903655    
## enginetypeohcf            -9.385e-02  1.956e-01  -0.480 0.632014    
## enginetypeohcv             2.543e-02  7.151e-02   0.356 0.722621    
## enginetyperotor           -3.877e-01  1.024e-01  -3.785 0.000213 ***
## carwidth                  -2.666e-02  1.422e-02  -1.876 0.062432 .  
## curbweight                -7.130e-04  7.675e-05  -9.290  < 2e-16 ***
## peakrpm                   -7.619e-05  3.864e-05  -1.972 0.050277 .  
## horsepower                -5.866e-03  8.125e-04  -7.219 1.69e-11 ***
## carManufactureraudi       -1.322e-01  1.400e-01  -0.944 0.346274    
## carManufacturerbmw        -3.048e-01  1.291e-01  -2.362 0.019328 *  
## carManufacturerbuick      -1.186e-01  1.342e-01  -0.884 0.378130    
## carManufacturerchevrolet   3.625e-01  1.517e-01   2.390 0.017955 *  
## carManufacturerdodge       2.315e-01  1.219e-01   1.899 0.059286 .  
## carManufacturerhonda       1.909e-01  1.238e-01   1.543 0.124789    
## carManufacturerisuzu       1.151e-01  1.319e-01   0.873 0.383904    
## carManufacturerjaguar      2.716e-01  1.444e-01   1.882 0.061615 .  
## carManufacturermazda       1.028e-01  1.168e-01   0.880 0.380104    
## carManufacturermercury     2.441e-01  1.921e-01   1.271 0.205472    
## carManufacturermitsubishi  2.865e-01  1.199e-01   2.389 0.017989 *  
## carManufacturernissan      2.316e-01  1.117e-01   2.074 0.039618 *  
## carManufacturerpeugeot     3.178e-01  2.355e-01   1.350 0.178909    
## carManufacturerplymouth    2.727e-01  1.246e-01   2.189 0.029974 *  
## carManufacturerporsche    -1.621e-01  1.935e-01  -0.838 0.403178    
## carManufacturerrenault     2.260e-01  1.547e-01   1.461 0.145794    
## carManufacturersaab        7.343e-02  1.245e-01   0.590 0.556121    
## carManufacturersubaru      2.231e-01  2.294e-01   0.973 0.332175    
## carManufacturertoyota      2.229e-01  1.061e-01   2.101 0.037135 *  
## carManufacturervolkswagen  1.620e-01  1.181e-01   1.371 0.172220    
## carManufacturervolvo       9.926e-02  1.297e-01   0.765 0.445099    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1457 on 169 degrees of freedom
## Multiple R-squared:  0.9639, Adjusted R-squared:  0.9564 
## F-statistic:   129 on 35 and 169 DF,  p-value: < 2.2e-16
par(bg = "white")
par(mfrow = c(2, 2))
plot(lm.mpg3aic)

vif(lm.mpg3aic)
##                         GVIF Df GVIF^(1/(2*Df))
## carbody             4.260997  4        1.198640
## enginetype       4671.186671  6        2.022021
## carwidth            8.943383  1        2.990549
## curbweight         15.354936  1        3.918537
## peakrpm             3.266314  1        1.807295
## horsepower          9.926502  1        3.150635
## carManufacturer 50156.872654 21        1.293936

Summary

  • Model explains ~95% of variance in fuel efficiency value
  • Heteroscedasticity: residuals vs fitted looks good -> no heteroscedasticity
  • Collinearity values are all below threshold of 5

Not sure if we want to mention it’s flawed due to leverage points and then tell story.

Story is quite clear though:

  • curbweight and horsepower are the strongest predictors with significant negative coefficients -> makes sense:
    • heavier (up) -> mpg per dollar (down)
    • more powerful (horsepower up) -> mpg per dollar (down)

Rotor engine has strong negative coefficient -> indicating it has a poor fuel efficiency value -> consistent with prior knowledge

3.3 Limitations

TBI

3.4 Suggestions for Improvements

TBI